Import der Daten und Vorverarbeitung

In [1]:
# import relevant modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import glob
import sys
sys.path.append('../scripts/')
from analysis import get_correlation, peak_analysis, peak_ranges
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [2]:
# keep only the hashtags that received an LDA topic assignment
topics_df = pd.read_json('../../data/BTW17_Twitter/lda/hashtag_topics.json')
hashtags = list(topics_df['hashtag'])
In [3]:
# load the daily hashtag count timeseries (one row per date/hashtag pair)
hashtag_df = pd.read_json('../../data/BTW17_Twitter/hashtags/hashtag_counts.json')
# quick schema check via rich display
hashtag_df.head(3)
Out[3]:
date hashtag count
0 2017-05-29 150jahrekapital 1
1 2017-05-29 a19 1
2 2017-05-29 abschiebung 14
In [4]:
# load politicians metadata and keep only the relevant columns
persons_df = pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
persons_df = (
    persons_df
    .drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'])
    # vectorized .str.lower() instead of a Python-level lambda (also NaN-safe)
    .assign(Name=lambda d: d['Name'].str.lower())
    .rename(columns={'Name': 'queryterm', 'Party': 'party', 'Gender': 'gender'})
)
persons_df.head(3)
Out[4]:
queryterm party gender
0 wolfgang stefinger CSU male
1 kai whittaker CDU male
2 katrin albsteiger CSU female
In [5]:
# load the manually assigned cluster categories
cluster_cat = pd.read_csv('../../data/BTW17_Suggestions/suggestions/cluster_categories.csv', delimiter=',')
cluster_cat = cluster_cat.drop(columns='Unnamed: 0')
# cluster size = number of comma-separated suggestions in the 'sugg' string
cluster_cat['size'] = cluster_cat['sugg'].str.count(', ') + 1
cluster_cat.head(3)
Out[5]:
cluster category sugg size
0 -1 Rauschen büro lorenz caffier, peter uldall juhl, cloud ... 6217
1 0 Rauschen gebrochen, stadt land fluss, konzert für dich,... 346
2 1 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
In [6]:
# load suggestions timeseries
tmp = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
# keep only the calendar date, dropping any time-of-day component
tmp['date'] = pd.to_datetime(tmp['date']).dt.date
suggestions_df = pd.DataFrame()
# groupby().count() returns the group keys plus one non-null-count column per
# remaining column; assigning into exactly four named columns assumes the
# parquet file has exactly one column besides date/queryterm/suggestion —
# TODO confirm against the file schema
suggestions_df[['date', 'queryterm', 'suggestion', 'count']] = tmp.groupby(['date', 'queryterm', 'suggestion'], as_index=False).count()
# attach party and gender of the politician behind each query term
suggestions_df = suggestions_df.merge(persons_df, how='left', on='queryterm')
In [7]:
# load the vector similarities between suggestion clusters and hashtags
similarity_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/vector_similarity.json')
# attach the full hashtag list to every row (exploded to one row per hashtag later)
similarity_df['hashtags'] = [hashtags] * len(similarity_df)
# token lists -> single space-joined suggestion string
similarity_df['suggestion'] = similarity_df['suggestion'].apply(' '.join)
In [8]:
# join suggestion clusters and aggregate counts per (date, person, cluster)
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
# NOTE: the original `.sum('count')` bound the string to the positional
# `numeric_only` parameter (any non-empty string is truthy) instead of
# selecting a column; select the count column explicitly.
suggestions_df = suggestions_df.groupby(['date', 'queryterm', 'party', 'gender', 'cluster'], as_index=False)[['count']].sum()
suggestions_df.head(3)
Out[8]:
date queryterm party gender cluster count
0 2017-05-29 achim post SPD male 2 4
1 2017-05-29 achim post SPD male 5 12
2 2017-05-29 achim post SPD male 75 4
In [9]:
# remodel similarity clusters to one row per (cluster, hashtag) pair
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores'])
# mean similarity per (cluster, hashtag); select the column explicitly —
# the original `.mean('similarity_scores')` passed the string to the
# positional `numeric_only` parameter instead of choosing a column
similarity_df = similarity_df.groupby(['cluster', 'hashtags'], as_index=False)[['similarity_scores']].mean()
similarity_df = similarity_df.merge(cluster_cat, how='left', on='cluster')

# drop the noise category ("Rauschen")
similarity_df = similarity_df[similarity_df['category'] != 'Rauschen'].reset_index(drop=True)
similarity_df.head(3)
Out[9]:
cluster hashtags similarity_scores category sugg size
0 1 afdwählen 0.008258 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
1 1 afghanistan -0.011473 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
2 1 altersarmut -0.008137 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
In [10]:
# prepare data for tlcc

# filter everything with sim_score < 0.5
sim_df = similarity_df[similarity_df['similarity_scores']>=0.5].reset_index(drop=True)

# group suggestions to cluster
cluster_df = suggestions_df.groupby(['date', 'cluster'], as_index=False).sum('count')
cluster_df.rename(columns={'count':'cluster_count'}, inplace=True)

# group suggestions per cluster and party
cluster_party_df = suggestions_df.groupby(['date', 'party', 'cluster'], as_index=False).sum('count')
cluster_party_df.rename(columns={'count':'cluster_count'}, inplace=True)

# group suggestions per cluster and gender
cluster_gender_df = suggestions_df.groupby(['date', 'gender', 'cluster'], as_index=False).sum('count')
cluster_gender_df.rename(columns={'count':'cluster_count'}, inplace=True)

hashtag_df.rename(columns={'count':'hashtag_count'}, inplace=True)
In [11]:
# Build a doubled color cycle WITHOUT mutating the shared palette:
# the original `colors.extend(px.colors.qualitative.Antique)` appended to the
# module-level palette list itself, so every re-run of the cell grew it again.
colors = list(px.colors.qualitative.Antique) * 2

Time Lagged Cross Correlation

In [12]:
# time lags in days: 0, 7, ..., 70 (weekly steps up to ten weeks)
delays = list(range(0, 71, 7))
In [13]:
# (disabled) compute the time-lagged cross correlation for every delay;
# expensive — the results were written to disk once and are read back below
#dfs = []
#for i in delays:
#    dfs.append(get_correlation(i, hashtag_df, cluster_df, cluster_gender_df, cluster_party_df, sim_df))
In [14]:
# (disabled) persist the computed correlation frames as JSON cache files
#for i in range(len(dfs)):
#    dfs[i].to_json(f'../../data/Analysis/df_{delays[i]}_delays.json')
In [15]:
# set to *.json to load all
input_loc = '../../data/Analysis/*delays.json'
input_files = glob.glob(input_loc)


def _load_delay_file(path):
    """Read one cached correlation file, attach cluster categories and
    keep only non-negative correlations."""
    frame = pd.read_json(path).merge(cluster_cat, how='left', on='cluster')
    # stricter alternative filter, kept for reference:
    #frame = frame[(frame['pearsonr']>=0)&(frame['p_value']<=0.05)&(frame['gender']=='all')&(frame['party']=='all')]
    return frame[frame['pearsonr'] >= 0]


dfs = [_load_delay_file(path) for path in input_files]

Deskriptives

In [211]:
hashtag_df
Out[211]:
date hashtag hashtag_count
0 2017-05-29 150jahrekapital 1
1 2017-05-29 a19 1
2 2017-05-29 abschiebung 14
3 2017-05-29 abschiebungen 1
4 2017-05-29 afd 805
... ... ... ...
99165 2017-09-25 westerwald 2
99166 2017-09-25 wählerwanderung 4
99167 2017-09-25 zeit 2
99168 2017-09-25 zionfaschos 1
99169 2017-09-25 öpp 1

99170 rows × 3 columns

In [16]:
# overview of possible vs. relevant cluster/hashtag combinations
n_possible = len(similarity_df[similarity_df["category"] != "Rauschen"])
n_relevant = len(sim_df)
per_hashtag = n_relevant / sim_df["hashtags"].nunique()
share = round(len(sim_df[sim_df["category"] != "Rauschen"]) / n_possible * 100, 2)
print(f'Anzahl möglicher Kombinationen: {n_possible}')
print(f'Anzahl relevanter Kombinationen: {n_relevant}')
print(f'Anzahl Kombinationen pro Hashtag: {per_hashtag}')
print(f'Anteil relevanter Kombinationen: {share}%')
Anzahl möglicher Kombinationen: 114696
Anzahl relevanter Kombinationen: 1050
Anzahl Kombinationen pro Hashtag: 6.481481481481482
Anteil relevanter Kombinationen: 0.92%

Kategorien der Suchvorschlag-Cluster

In [17]:
# number of relevant (cluster, hashtag) combinations per category
for category in sim_df['category'].unique():
    subset = sim_df[sim_df['category'] == category]
    n_combinations = subset.groupby(["cluster", "hashtags"], as_index=False).ngroups
    print(f'Kategorie: {category}, Anzahl relevanter Kombinationen: {n_combinations}')
Kategorie: Personen, Anzahl relevanter Kombinationen: 690
Kategorie: Orte, Anzahl relevanter Kombinationen: 41
Kategorie: Politik, Anzahl relevanter Kombinationen: 108
Kategorie: Medizin, Anzahl relevanter Kombinationen: 12
Kategorie: Organisationen, Anzahl relevanter Kombinationen: 18
Kategorie: Medien, Anzahl relevanter Kombinationen: 25
Kategorie: Wirtschaft, Anzahl relevanter Kombinationen: 52
Kategorie: Berufe, Anzahl relevanter Kombinationen: 104
In [18]:
sim_df.groupby('category', as_index=False)['similarity_scores'].mean()
Out[18]:
category similarity_scores
0 Berufe 0.720236
1 Medien 0.608040
2 Medizin 0.511738
3 Organisationen 0.572786
4 Orte 0.596317
5 Personen 0.588212
6 Politik 0.568747
7 Wirtschaft 0.578236
In [19]:
# load cluster assignments and attach the manual categories
cluster_cat_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/cluster.json')
cluster_cat_df = cluster_cat_df.merge(cluster_cat, how='left', on='cluster')

# rows per cluster -> cluster size, without the noise category
counts = cluster_cat_df['cluster'].value_counts()
tmp = pd.DataFrame({'cluster': counts.index, 'Clustergröße': counts.values})
tmp = tmp.merge(cluster_cat[['cluster', 'category']], how='left', on='cluster')
tmp = tmp[tmp['category'] != 'Rauschen']

# number of distinct clusters per category
tmp2 = (cluster_cat_df.groupby('category', as_index=False)['cluster']
        .nunique()
        .sort_values(by='cluster', ascending=False)
        .rename(columns={'cluster': 'n_cluster'}))
tmp = tmp.merge(tmp2, on='category')

# German display labels for the summary table below
tmp = tmp.rename(columns={'category': 'Kategorie', 'cluster': 'Cluster', 'n_cluster': 'Anzahl Cluster'})
In [20]:
tmp.groupby('Kategorie', as_index=False).mean()
Out[20]:
Kategorie Cluster Clustergröße Anzahl Cluster
0 Berufe 354.888889 19.555556 18.0
1 Justiz 279.000000 27.125000 8.0
2 Medien 359.818182 16.090909 22.0
3 Medizin 400.571429 16.142857 7.0
4 Organisationen 409.904762 18.666667 21.0
5 Orte 336.730769 31.410256 78.0
6 Personen 388.914081 22.069212 419.0
7 Politik 368.113636 23.784091 88.0
8 Privatleben 456.833333 23.916667 12.0
9 Wirtschaft 355.428571 23.142857 35.0
In [21]:
# t-SNE embedding of all suggestions, colored by cluster category
fig = px.scatter(
    cluster_cat_df,
    x='t-SNE(x)', y='t-SNE(y)',
    color='category', hover_name='suggestion',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Regressionsanalysen unterschiedlicher Dimensionen (Einflüsse auf die mittleren Similarity Scores)

In [22]:
# regression analysis: influences on the mean similarity scores.
# Aggregate explicitly selected columns instead of a bare .sum()/.mean(),
# which in legacy pandas silently dropped (or concatenated) the non-numeric
# columns and errors in newer versions.
reg_df = suggestions_df.groupby(['party', 'gender', 'cluster'], as_index=False)[['count']].sum()
tmp = similarity_df.groupby(['cluster', 'category'], as_index=False)[['similarity_scores', 'size']].mean()
reg_df = (reg_df.merge(tmp, how='left', on='cluster')
                .dropna()
                .reset_index(drop=True))
reg_df.head(3)
Out[22]:
party gender cluster count category similarity_scores size
0 AFD female 2 534 Wirtschaft 0.038410 20.0
1 AFD female 9 428 Orte 0.013585 225.0
2 AFD female 13 25 Wirtschaft 0.015126 306.0
In [23]:
# OLS: mean similarity score explained by cluster size, party, gender and category
# (categorical terms are dummy-coded via C(); the first level is the baseline)
reg = smf.ols('similarity_scores ~ size + C(party) + C(gender) + C(category)', data=reg_df).fit()
reg.summary()
Out[23]:
OLS Regression Results
Dep. Variable: similarity_scores R-squared: 0.052
Model: OLS Adj. R-squared: 0.045
Method: Least Squares F-statistic: 7.234
Date: Mon, 24 Jan 2022 Prob (F-statistic): 2.83e-20
Time: 08:57:47 Log-Likelihood: 5575.9
No. Observations: 2656 AIC: -1.111e+04
Df Residuals: 2635 BIC: -1.099e+04
Df Model: 20
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.0116 0.005 2.550 0.011 0.003 0.021
C(party)[T.CDU] -0.0038 0.003 -1.226 0.220 -0.010 0.002
C(party)[T.CSU] -0.0033 0.003 -0.991 0.322 -0.010 0.003
C(party)[T.DIE LINKE] -0.0039 0.003 -1.175 0.240 -0.010 0.003
C(party)[T.FDP] -5.014e-06 0.004 -0.001 0.999 -0.008 0.008
C(party)[T.GRÜNE] -0.0023 0.003 -0.719 0.472 -0.009 0.004
C(party)[T.Parteilos] -0.0012 0.005 -0.249 0.803 -0.011 0.008
C(party)[T.SPD] -0.0029 0.003 -0.950 0.342 -0.009 0.003
C(party)[T.SSW] 0.0017 0.012 0.143 0.886 -0.021 0.025
C(party)[T.fraktionslos] -0.0030 0.007 -0.442 0.658 -0.017 0.010
C(gender)[T.male] 0.0004 0.001 0.316 0.752 -0.002 0.003
C(category)[T.Justiz] -0.0073 0.006 -1.284 0.199 -0.018 0.004
C(category)[T.Medien] -0.0310 0.004 -7.146 0.000 -0.039 -0.022
C(category)[T.Medizin] -0.0205 0.006 -3.550 0.000 -0.032 -0.009
C(category)[T.Organisationen] -0.0047 0.004 -1.090 0.276 -0.013 0.004
C(category)[T.Orte] -0.0101 0.004 -2.805 0.005 -0.017 -0.003
C(category)[T.Personen] -0.0042 0.003 -1.197 0.231 -0.011 0.003
C(category)[T.Politik] -0.0035 0.004 -0.946 0.344 -0.011 0.004
C(category)[T.Privatleben] 0.0051 0.005 1.131 0.258 -0.004 0.014
C(category)[T.Wirtschaft] 0.0031 0.004 0.748 0.454 -0.005 0.011
size -6.719e-06 1.25e-05 -0.539 0.590 -3.12e-05 1.77e-05
Omnibus: 97.716 Durbin-Watson: 2.018
Prob(Omnibus): 0.000 Jarque-Bera (JB): 265.080
Skew: -0.109 Prob(JB): 2.75e-58
Kurtosis: 4.532 Cond. No. 1.25e+03


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.25e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Ergebnisse der TLCC

Fragestellung: Wie lange dauert die Durchdringung im Durchschnitt und nach den jeweiligen Dimensionen? Messung: TLCC mit Pearson R und p-Wert

Betrachtung über alle Kombinationen

In [24]:
# overview of the mean correlations and pooled p-values per time lag
records = []
for delay, frame in zip(delays, dfs):
    overall = frame[(frame['gender'] == 'all') & (frame['party'] == 'all')]
    pooled_p = stats.combine_pvalues(overall['p_value'].to_numpy())[1]
    records.append({
        'Time Lag (in Wochen)': int(delay / 7),
        'Pearson R': round(overall['pearsonr'].mean(), 3),
        'P-Wert': round(pooled_p, 3),
    })

tmp = pd.DataFrame(records)
tmp
Out[24]:
Time Lag (in Wochen) Pearson R P-Wert
0 0 0.120 0.0
1 1 0.122 0.0
2 2 0.119 0.0
3 3 0.131 0.0
4 4 0.141 0.0
5 5 0.131 0.0
6 6 0.161 0.0
7 7 0.170 0.0
8 8 0.163 0.0
9 9 0.171 0.0
10 10 0.116 0.0

Sämtliche Korrelationen sind signifikant (p<0.05), deshalb Betrachtung im Plot.

In [25]:
# mean correlation as a function of the time lag
fig = px.line(
    tmp,
    x='Time Lag (in Wochen)', y='Pearson R',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Plateau zwischen 6-9 Wochen, allerdings sehr geringe Korrelation. Ausnahmen bei 9 Wochen sind nur wenige:

In [115]:
# strongest combinations (r >= 0.5) at the 9-week lag, overall dimension only
tmp = dfs[9].loc[(dfs[9]['gender'] == 'all') & (dfs[9]['party'] == 'all')]
tmp = tmp.loc[tmp['pearsonr'] >= 0.5]
tmp.sort_values(by='pearsonr', ascending=False)[['cluster', 'hashtags', 'category_x', 'pearsonr', 'similarity_scores']]
Out[115]:
cluster hashtags category_x pearsonr similarity_scores
9814 711 bureg Personen 0.735616 0.675571
2482 387 btw2017 Personen 0.666920 0.660750
4679 505 btw2017 Personen 0.651856 0.512437
2508 387 bundestagswahl Personen 0.627181 0.692417
4705 505 bundestagswahl Personen 0.622913 0.554438
7513 620 btw Personen 0.589454 0.544000
7656 620 linke Personen 0.542172 0.563857
2950 387 traudichdeutschland Personen 0.541448 0.603667
2534 387 darumgrün Personen 0.539766 0.552000
4380 490 populismus Organisationen 0.533656 0.564286
7422 594 steineke Personen 0.513590 0.539500
2469 387 btw17 Personen 0.505943 0.603667
8397 671 islamisierung Orte 0.503226 0.658875

Betrachtung nach Kategorie der Cluster

In [189]:
# correlations and pooled p-values per cluster category and time lag
delay_list = []
categories = []
r_list = []
p_list = []

for i in range(len(dfs)):
    # the overall (gender == all, party == all) slice does not depend on the
    # category, so compute it once per delay instead of once per inner iteration
    df = dfs[i][(dfs[i]['gender'] == 'all') & (dfs[i]['party'] == 'all')]
    # unique() instead of set(): deterministic iteration order across runs
    for category in similarity_df['category'].unique():
        delay_list.append(delays[i])
        categories.append(category)
        subset = df[df['category_x'] == category]
        r_list.append(round(subset['pearsonr'].mean(), 3))
        p_list.append(round(stats.combine_pvalues(subset['p_value'].to_numpy())[1], 3))

plot_cat = pd.DataFrame(data={'Delay': delay_list, 'Kategorie': categories, 'Pearson R': r_list, 'P-Wert': p_list})
plot_cat = plot_cat.dropna()
plot_cat = plot_cat.reset_index(drop=True)
plot_cat[plot_cat['Kategorie']=='Wirtschaft']
Out[189]:
Delay Kategorie Pearson R P-Wert
1 0 Wirtschaft 0.116 0.046
8 7 Wirtschaft 0.110 0.191
15 14 Wirtschaft 0.100 0.621
22 21 Wirtschaft 0.151 0.001
29 28 Wirtschaft 0.144 0.004
36 35 Wirtschaft 0.181 0.000
43 42 Wirtschaft 0.267 0.000
50 49 Wirtschaft 0.270 0.000
57 56 Wirtschaft 0.236 0.000
64 63 Wirtschaft 0.169 0.000
71 70 Wirtschaft 0.120 0.083
In [191]:
# correlations and pooled p-values per gender and time lag
delay_list = []
gender_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    # gender-specific rows with party == all, excluding the noise category;
    # both filters are invariant w.r.t. the inner loop, so hoist them
    df = dfs[i][(dfs[i]['gender'] != 'all') & (dfs[i]['party'] == 'all')]
    df = df[df['category_x'] != 'Rauschen']
    # unique() instead of set(): deterministic iteration order across runs
    for gender in suggestions_df['gender'].unique():
        delay_list.append(delays[i])
        gender_list.append(gender)
        subset = df[df['gender'] == gender]
        r_list.append(round(subset['pearsonr'].mean(), 3))
        p_list.append(round(stats.combine_pvalues(subset['p_value'].to_numpy())[1], 3))

plot_gender = pd.DataFrame(data={'Delay': delay_list, 'Geschlecht': gender_list, 'Pearson R': r_list, 'P-Wert': p_list})
plot_gender = plot_gender.dropna()
plot_gender[plot_gender['Geschlecht']=='male']
Out[191]:
Delay Geschlecht Pearson R P-Wert
0 0 male 0.141 0.0
2 7 male 0.135 0.0
4 14 male 0.133 0.0
6 21 male 0.129 0.0
8 28 male 0.133 0.0
10 35 male 0.124 0.0
12 42 male 0.158 0.0
14 49 male 0.169 0.0
16 56 male 0.175 0.0
18 63 male 0.182 0.0
20 70 male 0.125 0.0
In [207]:
# official party colors, in the alphabetical order the sorted plot data uses
party_colors = [
    'rgb(0,158,224)',    # AfD
    'rgb(50,48,46)',     # CDU
    'rgb(0,128,201)',    # CSU
    'rgb(182,28,62)',    # Die Linke
    'rgb(255,237,0)',    # FDP
    'rgb(70,150,43)',    # Gruene
    'rgb(203,166,115)',  # independent (parteilos)
    'rgb(227,0,15)',     # SPD
    'rgb(173,185,202)',  # without parliamentary group (fraktionslos)
]
In [208]:
# correlations and pooled p-values per party and time lag, then plot the
# significant ones
delay_list = []
party_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    # party-specific rows with gender == all, excluding the noise category;
    # both filters are invariant w.r.t. the inner loop, so hoist them
    df = dfs[i][(dfs[i]['gender'] == 'all') & (dfs[i]['party'] != 'all')]
    df = df[df['category_x'] != 'Rauschen']
    # unique() instead of set(): deterministic iteration order across runs
    for party in suggestions_df['party'].unique():
        delay_list.append(delays[i])
        party_list.append(party)
        subset = df[df['party'] == party]
        r_list.append(round(subset['pearsonr'].mean(), 3))
        p_list.append(round(stats.combine_pvalues(subset['p_value'].to_numpy())[1], 3))

plot_party = pd.DataFrame(data={'Delay': delay_list, 'Partei': party_list, 'Pearson R': r_list, 'P-Wert': p_list})
plot_party = plot_party.dropna()
# keep only significant correlations
plot_party = plot_party[plot_party['P-Wert'] < 0.05]
plot_party['Time Lag (in Wochen)'] = plot_party['Delay'] / 7
# sort so the trace order matches the alphabetical party_colors list
plot_party = plot_party.sort_values(by=['Partei', 'Delay'], ascending=True)

fig = px.line(plot_party, x='Time Lag (in Wochen)', y='Pearson R', color='Partei',
              template='simple_white', color_discrete_sequence=party_colors)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [194]:
# parties that survive the significance filter
for party in plot_party['Partei'].unique():
    print(party)
SPD
CDU
AFD
DIE LINKE
GRÜNE
CSU
FDP
fraktionslos
Parteilos

t-Tests der Mittelwerte der Tage um die Peaks

In [30]:
# load peak dates and expand each hashtag to one row per peak window
peaks_df = pd.read_json('../../data/BTW17_Twitter/peaks/peak_dates.json')
# number of peaks = number of peak days / 7 (presumably one week per peak —
# verify against the peak detection script)
peaks_df['num_peaks'] = peaks_df['lda_dates'].apply(len) / 7
peaks_df[['peak_start', 'peak_end']] = peaks_df.apply(peak_ranges, axis=1)
peaks_df = peaks_df.drop(columns=['index', 'num_peaks', 'lda_dates'])
peaks_df = peaks_df.set_index(['hashtag']).apply(pd.Series.explode).reset_index()
peaks_df.head(3)
Out[30]:
hashtag peak_start peak_end
0 afghanistan 2017-05-29 2017-06-04
1 afghanistan 2017-08-22 2017-08-28
2 armut 2017-07-03 2017-07-09
In [31]:
# suggestion counts per day, cluster, party and gender, with cluster categories.
# Select the count column explicitly — the original `.sum('count')` passed the
# string to the positional `numeric_only` parameter instead of picking a column.
cluster_ts_df = suggestions_df.groupby(['date', 'cluster', 'party', 'gender'], as_index=False)[['count']].sum()
cluster_ts_df = cluster_ts_df.merge(cluster_cat[['cluster', 'category']], how='left', on='cluster')
cluster_ts_df.head(3)
Out[31]:
date cluster party gender count category
0 2017-05-29 0 AFD female 16 Rauschen
1 2017-05-29 0 AFD male 97 Rauschen
2 2017-05-29 0 CDU female 532 Rauschen
In [32]:
# (disabled) run the peak analysis for every test range; expensive —
# the results were written to disk once and are read back below
#analysis_dfs = []
#
#for i in tqdm(range(len(delays[1:]))):
#    test_range = delays[i+1]
#    tmp = peak_analysis(test_range, sim_df, peaks_df, cluster_ts_df)
#    tmp = tmp.reset_index(drop=True)
#    analysis_dfs.append(tmp)
In [33]:
# save files
# (disabled) persist the peak analysis results as JSON cache files
#for i in range(len(analysis_dfs)):
#    analysis_dfs[i].to_json(f'../../data/Analysis/peak_analysis_detail_range_{delays[i]}.json')
In [34]:
# set to *.json to load all
input_loc = '../../data/Analysis/peak_analysis*.json'
input_files = glob.glob(input_loc)

# one cached DataFrame per test range
analysis_dfs = [pd.read_json(path) for path in input_files]
In [46]:
# print ttest results for 1, 5 and 9 weeks
for i in range(len(analysis_dfs)):
    tmp = analysis_dfs[i]
    if tmp['test_range'].mean() in [7,35,63]:
        test_range = tmp['test_range'].unique()
        a = tmp[tmp['time']=='after']['count']
        b = tmp[tmp['time']=='before']['count']
        results = stats.ttest_ind(a,b, equal_var=False)
        print(tmp.groupby('time', as_index=False).mean()[['time', 'count']])
        print(f'Test Range: {test_range}, t: {results[0]}, p: {results[1]}\n')
     time      count
0   after  15.483540
1  before  15.471336
Test Range: [7], t: 0.06244940490777691, p: 0.9502054819630756

     time      count
0   after  14.986474
1  before  15.018879
Test Range: [35], t: -0.1730405467201768, p: 0.8626209569115229

     time      count
0   after  14.694571
1  before  14.738089
Test Range: [63], t: -0.23856538878544817, p: 0.8114444970951588

In [48]:
# aggregate dfs for 1, 5 and 9 weeks
df = pd.DataFrame()
for i in range(len(analysis_dfs)):
    tmp = analysis_dfs[i]
    if tmp['test_range'].mean() in [7,35,63]:
        df = pd.concat([df, tmp])
In [106]:
# split the peak windows into before/after measurements and align them per
# group. Derive the frames without inplace ops on boolean-mask views — the
# original pattern triggered SettingWithCopy behaviour, hidden here by the
# global warning filter.
before = (df[df['time'] == 'before']
          .drop(columns='time')
          .rename(columns={'count': 'before'}))

after = (df[df['time'] == 'after']
         .drop(columns='time')
         .rename(columns={'count': 'after'}))

reg_df = before.merge(after, on=['cluster', 'party', 'gender', 'category', 'hashtag', 'peak', 'test_range'])
# relative change of the suggestion count after the peak
reg_df['diff'] = (reg_df['after'] - reg_df['before']) / reg_df['before']
In [108]:
# standard deviation of the daily suggestion counts per (cluster, party, gender).
# Restrict the aggregation to 'count' — a bare .std() also attempts the object
# columns (date, queryterm), which newer pandas rejects.
std_df = suggestions_df.groupby(['cluster', 'party', 'gender'], as_index=False)[['count']].std()
# min-max normalization so the regression coefficient is scale-comparable
std_df['norm_std'] = (std_df['count'] - std_df['count'].min()) / (std_df['count'].max() - std_df['count'].min())
std_df = std_df.rename(columns={'count': 'std'})
std_df.head(3)
Out[108]:
cluster party gender std norm_std
0 0 AFD female 6.487775 0.074339
1 0 AFD male 4.986847 0.057141
2 0 CDU female 9.486265 0.108696
In [109]:
# attach the (normalized) count standard deviation to each observation
reg_df = reg_df.merge(std_df, on=['cluster', 'party', 'gender'], how='left')
reg_df.head(3)
Out[109]:
cluster party gender category before hashtag peak test_range after diff std norm_std
0 211 CDU female Politik 4.000 afdwählen 1497484800000 7 3.750 -0.062500 5.989453 0.068629
1 211 CDU male Politik 19.625 afdwählen 1497484800000 7 24.125 0.229299 4.775760 0.054722
2 211 CSU female Politik 4.000 afdwählen 1497484800000 7 4.000 0.000000 1.116071 0.012788
In [110]:
# regression for test range 1 week
reg = smf.ols('diff ~ C(party) + C(gender) + C(category) + C(test_range) + norm_std',
              data=reg_df[reg_df['test_range']==7]).fit()
reg.summary()
Out[110]:
OLS Regression Results
Dep. Variable: diff R-squared: 0.014
Model: OLS Adj. R-squared: 0.013
Method: Least Squares F-statistic: 10.05
Date: Mon, 24 Jan 2022 Prob (F-statistic): 9.36e-26
Time: 10:42:43 Log-Likelihood: -9412.2
No. Observations: 11408 AIC: 1.886e+04
Df Residuals: 11391 BIC: 1.898e+04
Df Model: 16
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.1080 0.038 2.849 0.004 0.034 0.182
C(party)[T.CDU] -0.0603 0.030 -1.993 0.046 -0.120 -0.001
C(party)[T.CSU] -0.0782 0.033 -2.380 0.017 -0.143 -0.014
C(party)[T.DIE LINKE] 0.0676 0.033 2.074 0.038 0.004 0.131
C(party)[T.FDP] -0.0639 0.044 -1.443 0.149 -0.151 0.023
C(party)[T.GRÜNE] -0.1413 0.032 -4.382 0.000 -0.204 -0.078
C(party)[T.Parteilos] -0.1318 0.041 -3.192 0.001 -0.213 -0.051
C(party)[T.SPD] -0.0728 0.030 -2.436 0.015 -0.131 -0.014
C(party)[T.fraktionslos] -0.1030 0.067 -1.541 0.123 -0.234 0.028
C(gender)[T.male] -0.0168 0.013 -1.312 0.190 -0.042 0.008
C(category)[T.Medizin] -0.0438 0.065 -0.669 0.503 -0.172 0.084
C(category)[T.Organisationen] -0.0073 0.038 -0.194 0.846 -0.081 0.066
C(category)[T.Orte] 0.0791 0.027 2.956 0.003 0.027 0.132
C(category)[T.Personen] 0.0425 0.024 1.793 0.073 -0.004 0.089
C(category)[T.Politik] 0.0034 0.025 0.139 0.889 -0.045 0.052
C(category)[T.Wirtschaft] 0.0219 0.027 0.819 0.413 -0.030 0.074
norm_std 0.5225 0.175 2.979 0.003 0.179 0.866
Omnibus: 13742.275 Durbin-Watson: 1.878
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2013831.093
Skew: 6.444 Prob(JB): 0.00
Kurtosis: 66.801 Cond. No. 47.2


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [111]:
# regression for test range 5 week
reg = smf.ols('diff ~ C(party) + C(gender) + C(category) + C(test_range) + norm_std',
              data=reg_df[reg_df['test_range']==35]).fit()
reg.summary()
Out[111]:
OLS Regression Results
Dep. Variable: diff R-squared: 0.036
Model: OLS Adj. R-squared: 0.034
Method: Least Squares F-statistic: 26.79
Date: Mon, 24 Jan 2022 Prob (F-statistic): 1.29e-79
Time: 10:43:35 Log-Likelihood: -11598.
No. Observations: 11623 AIC: 2.323e+04
Df Residuals: 11606 BIC: 2.336e+04
Df Model: 16
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.1050 0.045 2.354 0.019 0.018 0.192
C(party)[T.CDU] -0.2217 0.036 -6.171 0.000 -0.292 -0.151
C(party)[T.CSU] -0.2537 0.039 -6.546 0.000 -0.330 -0.178
C(party)[T.DIE LINKE] 0.0736 0.039 1.904 0.057 -0.002 0.149
C(party)[T.FDP] -0.2741 0.051 -5.412 0.000 -0.373 -0.175
C(party)[T.GRÜNE] -0.2516 0.038 -6.572 0.000 -0.327 -0.177
C(party)[T.Parteilos] -0.3542 0.049 -7.218 0.000 -0.450 -0.258
C(party)[T.SPD] -0.1666 0.036 -4.693 0.000 -0.236 -0.097
C(party)[T.fraktionslos] -0.1204 0.079 -1.515 0.130 -0.276 0.035
C(gender)[T.male] 0.1127 0.015 7.464 0.000 0.083 0.142
C(category)[T.Medizin] -0.0417 0.077 -0.538 0.591 -0.194 0.110
C(category)[T.Organisationen] 0.1765 0.043 4.088 0.000 0.092 0.261
C(category)[T.Orte] 0.2520 0.031 8.082 0.000 0.191 0.313
C(category)[T.Personen] 0.1200 0.027 4.379 0.000 0.066 0.174
C(category)[T.Politik] 0.0929 0.028 3.267 0.001 0.037 0.149
C(category)[T.Wirtschaft] 0.0031 0.031 0.100 0.921 -0.058 0.064
norm_std 0.5348 0.208 2.577 0.010 0.128 0.942
Omnibus: 11168.564 Durbin-Watson: 1.828
Prob(Omnibus): 0.000 Jarque-Bera (JB): 640625.678
Skew: 4.632 Prob(JB): 0.00
Kurtosis: 38.171 Cond. No. 47.4


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [112]:
# regression for test range 9 week
reg = smf.ols('diff ~ C(party) + C(gender) + C(category) + C(test_range) + norm_std',
              data=reg_df[reg_df['test_range']==63]).fit()
reg.summary()
Out[112]:
OLS Regression Results
Dep. Variable: diff R-squared: 0.081
Model: OLS Adj. R-squared: 0.080
Method: Least Squares F-statistic: 65.45
Date: Mon, 24 Jan 2022 Prob (F-statistic): 1.51e-203
Time: 10:43:40 Log-Likelihood: -12438.
No. Observations: 11881 AIC: 2.491e+04
Df Residuals: 11864 BIC: 2.504e+04
Df Model: 16
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.0954 0.047 2.041 0.041 0.004 0.187
C(party)[T.CDU] -0.2978 0.038 -7.911 0.000 -0.372 -0.224
C(party)[T.CSU] -0.3614 0.040 -8.972 0.000 -0.440 -0.282
C(party)[T.DIE LINKE] 0.2082 0.040 5.153 0.000 0.129 0.287
C(party)[T.FDP] -0.5203 0.050 -10.318 0.000 -0.619 -0.421
C(party)[T.GRÜNE] -0.3211 0.040 -8.036 0.000 -0.399 -0.243
C(party)[T.Parteilos] -0.4848 0.051 -9.418 0.000 -0.586 -0.384
C(party)[T.SPD] -0.2278 0.037 -6.117 0.000 -0.301 -0.155
C(party)[T.fraktionslos] -0.0997 0.083 -1.195 0.232 -0.263 0.064
C(gender)[T.male] 0.1885 0.016 12.101 0.000 0.158 0.219
C(category)[T.Medizin] -0.0411 0.081 -0.506 0.613 -0.201 0.118
C(category)[T.Organisationen] 0.4531 0.045 9.996 0.000 0.364 0.542
C(category)[T.Orte] 0.2816 0.033 8.614 0.000 0.218 0.346
C(category)[T.Personen] 0.1706 0.029 5.949 0.000 0.114 0.227
C(category)[T.Politik] 0.1136 0.030 3.819 0.000 0.055 0.172
C(category)[T.Wirtschaft] -0.0207 0.033 -0.632 0.528 -0.085 0.043
norm_std 0.7421 0.216 3.434 0.001 0.318 1.166
Omnibus: 8771.777 Durbin-Watson: 1.938
Prob(Omnibus): 0.000 Jarque-Bera (JB): 201534.726
Skew: 3.342 Prob(JB): 0.00
Kurtosis: 22.037 Cond. No. 47.5


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.